We use the dscr package to perform dynamic comparisons for the DE (differential expression) methods on simulated RNA-seq data (simulation based on GTEx RNA-seq dataset).

Methods

The methods for comparision include:

Simulation settings

We perform the simulations using the dscr package and then run DE analysis to test/estimate the effect size between two groups. The sample size for each of the two groups is 2, 4 or 10. The non-zero effect distribution has different shapes: Densities of non-zero effects used in simulations.

The codes for simulation are in directory dsc-poisthin-indep (simulate RNA-seq data with independent genes) and dsc-poisthin-dep (simulate RNA-seq data with dependent genes).

Run the simulations by the following commands and the results will be saved in dsc-poisthin-indep/res.Rdata and dsc-poisthin-dep/res.Rdata.

library(dscr)

# run simulations with independent genes
source("../dsc-poisthin-indep/run_dsc.R")

# run simulations with dependent genes
source("../dsc-poisthin-dep/run_dsc.R")

Results

Simulations with independent genes

library(reshape2)
library(ggplot2)
library(dplyr)
library(tidyr)

load("../dsc-poisthin-indep/res.RData")
resscore = separate(res$score,scenario,c("scenario","nsamp"),",nsamp=")
resscore$nsamp = paste0("N=",resscore$nsamp)
resscore$scenario = factor(resscore$scenario,
                           levels=c("spiky","near_normal","flat_top","big-normal","bimodal"))

resscore$method[resscore$method %in% c("DESeq2","edgeR")] = 
  paste0(resscore$method[resscore$method %in% c("DESeq2","edgeR")],"+qval")
resscore2 = filter(resscore, method %in% c("DESeq2+qval","VL+eBayes+qval","VL+pval2se+ash",
                                           "VL+eBayes+ash", "VL+eBayes+ash.alpha=1"))
resscore = filter(resscore, method %in% c("VL+eBayes+qval", "VL+ash","VL+pval2se+ash",
                                          "VL+eBayes+ash", "VL+eBayes+ash.alpha=1"))

We plot the estimated \(\pi_0\) (null proportion) vs true \(\pi_0\) for different scenarios.

pi0_plot = ggplot(resscore, aes(pi0,pi0.est,colour=method)) +geom_point(shape=1) +
  facet_grid(nsamp ~ scenario) + 
  guides(alpha=FALSE) +
  geom_abline(colour = "black") +
  xlab("True pi0") +
  ylab("Estimated pi0") 
print(pi0_plot +scale_y_continuous(limits=c(0,1)) +
        scale_x_continuous(limits=c(0,1)) +
        coord_equal(ratio=1) + 
        guides(fill=guide_legend(nrow=2))+
        theme(legend.position = "top",axis.text.x = element_text(size = 8,angle=45)))

The actual false discovery proportions if declaring genes with q-values under 0.05 as positives:

fdp_plot=ggplot(resscore,
                aes(pi0,FDP_005,colour=method)) +geom_point(shape=1) +
  facet_grid(nsamp ~ scenario) + 
  guides(alpha=FALSE) +
  geom_abline(slope=0,intercept=0.05,colour = "black") +
  xlab("True pi0") +
  ylab("False discovery proportion when q=0.05") 
print(fdp_plot +scale_y_continuous(limits=c(0,1)) +
        scale_x_continuous(limits=c(0,1)) +
        coord_equal(ratio=1) + 
        theme(legend.position = "top",axis.text.x = element_text(size = 8,angle=45)))

The proportion of discoveries if declaring genes with q-values under 0.05 as positives:

dp_plot=ggplot(resscore,
                aes(pi0,DP_005,colour=method)) +geom_point(shape=1) +
  facet_grid(nsamp ~ scenario) + 
  guides(alpha=FALSE) +
  geom_abline(slope=-1,intercept=1,colour = "black") +
  xlab("True pi0") +
  ylab("Discovery proportion when q=0.05") 
print(dp_plot +scale_y_continuous(limits=c(0,1)) +
        scale_x_continuous(limits=c(0,1)) +
        coord_equal(ratio=1) + 
        theme(legend.position = "top",axis.text.x = element_text(size = 8,angle=45)))

RRMSE (relative root mean squared error) of effect estimates. We choose VL as the baseline level, and divide the RRMSE’s of the other methods by that of VL.

newres = resscore2
newres = newres[newres$method=="VL+eBayes+qval",]
newres = cbind(newres[,2:4],newres$rmse.beta)
names(newres)[4] = "rmse.beta_vleq"
test = left_join(resscore2,newres,by=c("seed","scenario","nsamp"))
test$rmse.beta_rel = test$rmse.beta/test$rmse.beta_vleq 
test$method[test$method=="VL+eBayes+qval"] = "VL"
test$method[test$method=="DESeq2+qval"] = "DESeq2"

rmse_plot=ggplot(test,
                 aes(pi0,rmse.beta_rel,colour=method)) +geom_point(shape=1) +
  facet_grid(nsamp ~ scenario) + 
  guides(alpha=FALSE) +
  geom_abline(slope=0,intercept=0,colour = "black") +
  xlab("True pi0") +
  ylab("Relative RMSE of effect estimates") 
print(rmse_plot +scale_y_continuous(limits=c(0,1.2)) +
        scale_x_continuous(limits=c(0,1)) +
        coord_equal(ratio=1) + 
        theme(legend.position = "top",axis.text.x = element_text(size = 8,angle=45)))

Coverage tables for all observations, “significant” negative discoveries and “significant” positive discoveries respectively:

# Coverage tables
coverthresh = 0.05 # threshold at which we look at coverage
findthresh=0.95 #threshold at we define a discovery significant

neglong = 
  res$negprob %>% 
  select(-user.self,-sys.self,-elapsed,-user.child,-sys.child) %>%
  melt(id.vars=c("method","scenario","seed",".id"),value.name="negprob") %>%
  filter(negprob > findthresh) %>%
  filter(method=="voom+vash+ash")

poslong = 
  res$posprob %>% 
  select(-user.self,-sys.self,-elapsed,-user.child,-sys.child) %>%
  melt(id.vars=c("method","scenario","seed",".id"),value.name="posprob") %>%
  filter(posprob > findthresh) %>%
  filter(method=="voom+vash+ash")

reslong = 
  res$cdf_score %>% 
  select(-user.self,-sys.self,-elapsed,-user.child,-sys.child) %>%    
  melt(id.vars=c("method","scenario","seed",".id")) %>%
  filter(method=="voom+vash+ash")

reslong.pos = inner_join(reslong,poslong)
reslong.neg = inner_join(reslong,neglong)

xtabs(lt~as.numeric(nsamp)+scenario,reslong %>% group_by(scenario,method) %>% 
        summarize(lt = 1-mean(value<coverthresh)) %>% 
        separate(scenario,c("scenario","nsamp"),",nsamp=")) %>% round(2)
##                  scenario
## as.numeric(nsamp) big-normal bimodal flat_top near_normal spiky
##                2        0.80    0.93     0.96        0.91  0.93
##                4        0.93    0.96     0.96        0.96  0.95
##                10       0.97    0.97     0.96        0.96  0.95
xtabs(lt~as.numeric(nsamp)+scenario,reslong.pos %>% group_by(scenario,method) %>% 
        summarize(lt = 1-mean(value<coverthresh)) %>% 
        separate(scenario,c("scenario","nsamp"),",nsamp=")) %>% round(2)
##                  scenario
## as.numeric(nsamp) big-normal bimodal flat_top near_normal spiky
##                2        0.98    0.96     0.96        0.96  0.96
##                4        0.96    0.96     0.96        0.96  0.96
##                10       0.95    0.96     0.95        0.96  0.96
xtabs(lt~as.numeric(nsamp)+scenario,reslong.neg %>% group_by(scenario,method) %>% 
        summarize(lt = 1-mean(value<coverthresh)) %>% 
        separate(scenario,c("scenario","nsamp"),",nsamp=")) %>% round(2)
##                  scenario
## as.numeric(nsamp) big-normal bimodal flat_top near_normal spiky
##                2        0.23    0.75     0.94        0.65  0.74
##                4        0.76    0.95     0.94        0.95  0.95
##                10       0.95    0.94     0.94        0.95  0.94

Simulations with independent genes

We compare the results similarly:

rm(list=ls())
load("../dsc-poisthin-dep/res.RData")

resscore = separate(res$score,scenario,c("scenario","nsamp"),",nsamp=")
resscore$nsamp = paste0("N=",resscore$nsamp)
resscore$scenario = factor(resscore$scenario,
                           levels=c("spiky","near_normal","flat_top","big-normal","bimodal"))

resscore$method[resscore$method %in% c("DESeq2","edgeR")] =
  paste0(resscore$method[resscore$method %in% c("DESeq2","edgeR")],"+qval")
resscore2 = filter(resscore, method %in% 
                     c("VL+eBayes+ash","VL+eBayes+ash+inflate",
                       "VL+eBayes+ash+inflate.ctl","mouthwash"))
resscore1 = filter(resscore, method %in% 
                     c("DESeq2+qval","edgeR+qval","VL+eBayes+qval",
                       "VL+eBayes+ash","VL+eBayes+ash.alpha=1"))

# True vs estimated pi0
pi0_plot=ggplot(resscore1,
                aes(pi0,pi0.est,colour=method)) +geom_point(shape=1) +
  facet_grid(nsamp ~ scenario) + 
  guides(alpha=FALSE) +
  geom_abline(colour = "black") +
  xlab("True pi0") +
  ylab("Estimated pi0") 
print(pi0_plot +scale_y_continuous(limits=c(0,1)) +
        scale_x_continuous(limits=c(0,1)) +
        coord_equal(ratio=1) + 
        theme(legend.position = "top",axis.text.x = element_text(size = 8,angle=45)))

# Actual FDP when q=0.05
fdp_plot=ggplot(resscore1,
                aes(pi0,FDP_005,colour=method)) +geom_point(shape=1) +
  facet_grid(nsamp ~ scenario) + 
  guides(alpha=FALSE) +
  geom_abline(slope=0,intercept=0.05,colour = "black") +
  xlab("True pi0") +
  ylab("False discovery proportion when q=0.05") 
print(fdp_plot +scale_y_continuous(limits=c(0,1)) +
        scale_x_continuous(limits=c(0,1)) +
        coord_equal(ratio=1) + 
        theme(legend.position = "top",axis.text.x = element_text(size = 8,angle=45)))

# RRMSE
newres = resscore1
newres = newres[newres$method=="VL+eBayes+qval",]
newres = cbind(newres[,2:4],newres$rmse.beta)
names(newres)[4] = "rmse.beta_vleq"
test = left_join(resscore1,newres,by=c("seed","scenario","nsamp"))
test$rmse.beta_rel = test$rmse.beta/test$rmse.beta_vleq 
test$method[test$method=="VL+eBayes+qval"] = "VL"
test$method[test$method=="DESeq2+qval"] = "DESeq2"
test$method[test$method=="edgeR+qval"] = "edgeR"

rmse_plot=ggplot(test, aes(pi0,rmse.beta_rel,colour=method)) +geom_point(shape=1) +
  facet_grid(nsamp ~ scenario) + 
  guides(alpha=FALSE) +
  geom_abline(slope=0,intercept=0,colour = "black") +
  xlab("True pi0") +
  ylab("Relative RMSE of effect estimates") 
print(rmse_plot +scale_y_continuous(limits=c(0,1.3)) +
        scale_x_continuous(limits=c(0,1)) +
        coord_equal(ratio=1) + 
        theme(legend.position = "top",axis.text.x = element_text(size = 8,angle=45)))

# Coverage tables
coverthresh = 0.05 # threshold at which we look at coverage
findthresh=0.95 #threshold at we define a discovery significant

neglong = 
  res$negprob %>% 
  select(-user.self,-sys.self,-elapsed,-user.child,-sys.child) %>%
  melt(id.vars=c("method","scenario","seed",".id"),value.name="negprob") %>%
  filter(negprob > findthresh) %>%
  filter(method=="voom+vash+ash")

poslong = 
  res$posprob %>% 
  select(-user.self,-sys.self,-elapsed,-user.child,-sys.child) %>%
  melt(id.vars=c("method","scenario","seed",".id"),value.name="posprob") %>%
  filter(posprob > findthresh) %>%
  filter(method=="voom+vash+ash")

reslong = 
  res$cdf_score %>% 
  select(-user.self,-sys.self,-elapsed,-user.child,-sys.child) %>%    
  melt(id.vars=c("method","scenario","seed",".id")) %>%
  filter(method=="voom+vash+ash")

reslong.pos = inner_join(reslong,poslong)
reslong.neg = inner_join(reslong,neglong)
dim(reslong.pos)
## [1] 1203531       7
dim(reslong.neg)
## [1] 1209703       7
xtabs(lt~as.numeric(nsamp)+scenario,reslong %>% group_by(scenario,method) %>% 
        summarize(lt = 1-mean(value<coverthresh)) %>% 
        separate(scenario,c("scenario","nsamp"),",nsamp=")) %>% round(2)
##                  scenario
## as.numeric(nsamp) big-normal bimodal flat_top near_normal spiky
##                2        0.80    0.93     0.96        0.91  0.93
##                4        0.94    0.97     0.96        0.97  0.96
##                10       0.97    0.97     0.96        0.96  0.95
xtabs(lt~as.numeric(nsamp)+scenario,reslong.pos %>% group_by(scenario,method) %>% 
        summarize(lt = 1-mean(value<coverthresh)) %>%
        separate(scenario,c("scenario","nsamp"),",nsamp=")) %>% round(2)
##                  scenario
## as.numeric(nsamp) big-normal bimodal flat_top near_normal spiky
##                2        0.98    0.96     0.96        0.96  0.95
##                4        0.96    0.96     0.95        0.96  0.96
##                10       0.96    0.96     0.95        0.95  0.94
xtabs(lt~as.numeric(nsamp)+scenario,reslong.neg %>% group_by(scenario,method) %>% 
        summarize(lt = 1-mean(value<coverthresh)) %>% 
        separate(scenario,c("scenario","nsamp"),",nsamp=")) %>% round(2)
##                  scenario
## as.numeric(nsamp) big-normal bimodal flat_top near_normal spiky
##                2        0.24    0.76     0.94        0.66  0.76
##                4        0.76    0.95     0.94        0.95  0.95
##                10       0.95    0.94     0.94        0.95  0.95

Session information

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.3
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] bindrcpp_0.2.2 tidyr_0.8.1    dplyr_0.7.6    ggplot2_3.0.0 
## [5] reshape2_1.4.3
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.18     rstudioapi_0.7   bindr_0.1.1      knitr_1.20      
##  [5] magrittr_1.5     tidyselect_0.2.4 munsell_0.5.0    colorspace_1.3-2
##  [9] R6_2.2.2         rlang_0.2.2      stringr_1.3.1    plyr_1.8.4      
## [13] tools_3.5.0      grid_3.5.0       gtable_0.2.0     withr_2.1.2     
## [17] htmltools_0.3.6  assertthat_0.2.0 yaml_2.2.0       lazyeval_0.2.1  
## [21] rprojroot_1.3-2  digest_0.6.16    tibble_1.4.2     crayon_1.3.4    
## [25] purrr_0.2.5      glue_1.3.0       evaluate_0.11    rmarkdown_1.10  
## [29] labeling_0.3     stringi_1.2.4    compiler_3.5.0   pillar_1.3.0    
## [33] scales_1.0.0     backports_1.1.2  pkgconfig_2.0.2